{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 80
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 2391,
     "status": "ok",
     "timestamp": 1584605487305,
     "user": {
      "displayName": "Arun Singh",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GiLB1oZWi0vJw-6NUEFnpDZhxuDlv90nmX88O0y-A=s64",
      "userId": "01425536422013553450"
     },
     "user_tz": -330
    },
    "id": "TFw1eQjUEWf3",
    "outputId": "6e757c42-31bc-4f8b-940d-bcf6d24643a5"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<p style=\"color: red;\">\n",
       "The default version of TensorFlow in Colab will soon switch to TensorFlow 2.x.<br>\n",
       "We recommend you <a href=\"https://www.tensorflow.org/guide/migrate\" target=\"_blank\">upgrade</a> now \n",
       "or ensure your notebook will continue to use TensorFlow 1.x via the <code>%tensorflow_version 1.x</code> magic:\n",
       "<a href=\"https://colab.research.google.com/notebooks/tensorflow_version.ipynb\" target=\"_blank\">more info</a>.</p>\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {
      "tags": []
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "## Idea for this code is taken from the kernel https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s\n",
    "## Thanks to the authors for their elegant idea of using MLPs for this problem.\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "import os\n",
    "import gc\n",
    "import time\n",
    "from datetime import datetime\n",
    "\n",
    "from contextlib import contextmanager\n",
    "\n",
    "import keras as ks\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import scipy\n",
    "import tensorflow as tf\n",
    "from keras.models import load_model\n",
    "\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf\n",
    "\n",
    "\n",
    "from scipy.sparse import hstack\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.metrics import mean_squared_log_error as msle\n",
    "from sklearn.model_selection import KFold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 2357,
     "status": "ok",
     "timestamp": 1584605487309,
     "user": {
      "displayName": "Arun Singh",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GiLB1oZWi0vJw-6NUEFnpDZhxuDlv90nmX88O0y-A=s64",
      "userId": "01425536422013553450"
     },
     "user_tz": -330
    },
    "id": "-XxpgEDxCKI7",
    "outputId": "ce8ea9d8-620b-4829-abb4-155a8402c1bb"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
     ]
    }
   ],
   "source": [
    "from google.colab import drive\n",
    "drive.mount('/content/drive')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "I7MVOrHCtyXx"
   },
   "outputs": [],
   "source": [
    "os.chdir('/content/drive/My Drive/Colab Notebooks/Mercari_NN')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "xIFOK51BFIac"
   },
   "outputs": [],
   "source": [
    "@contextmanager\n",
    "def timer(name):\n",
    "    t0 = time.time()\n",
    "    yield\n",
    "    print(f'[{name}] done in {time.time() - t0:.0f} s')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "Gh6Ihc0RFWOx"
   },
   "outputs": [],
   "source": [
    "def preprocess(df: pd.DataFrame) -> pd.DataFrame:\n",
    "    df['name'] = df['name'].fillna('') + ' ' + df['brand_name'].fillna('')\n",
    "    df['text'] = df['item_description'].fillna('') + ' ' + df['name'] + ' ' + df['category_name'].fillna('')\n",
    "    return df[['name', 'text', 'shipping', 'item_condition_id']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "Eh1OxQLYvJiZ"
   },
   "source": [
    "### Loading train data and splitting for Cross Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "X0UwvIs9H1x6"
   },
   "outputs": [],
   "source": [
    "def load_train():\n",
    "    \n",
    "    train = pd.read_table('train.tsv')\n",
    "    train = train[train['price'] > 0].reset_index(drop=True)\n",
    "    cv = KFold(n_splits=20, shuffle=True, random_state=42)\n",
    "    train_ids, valid_ids = next(cv.split(train))\n",
    "    train, valid = train.iloc[train_ids], train.iloc[valid_ids]\n",
    "\n",
    "    global y_scaler\n",
    "    y_scaler = StandardScaler()\n",
    "\n",
    "    global train_price, valid_price \n",
    "    train_price = train['price'].values.reshape(-1, 1)\n",
    "    valid_price = valid['price'].values.reshape(-1, 1)\n",
    "\n",
    "    y_train = y_scaler.fit_transform(np.log1p(train_price))\n",
    "    \n",
    "    y_valid = y_scaler.transform(np.log1p(valid_price))\n",
    "\n",
    "    return train, valid, y_train, y_valid"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "nUvVUqsqvSlN"
   },
   "source": [
    "### Pre-processing and featurization of train data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "-L3tj2JkJvHT"
   },
   "outputs": [],
   "source": [
    "def process_train(train, valid):\n",
    "    \n",
    "    global vectorizer1, vectorizer2, vectorizer3, vectorizer4\n",
    "    with timer('process train'):\n",
    "        train = preprocess(train)\n",
    "        vectorizer1 = Tfidf(max_features=100000, token_pattern='\\w+', dtype=np.float32)\n",
    "        train_namevec  = vectorizer1.fit_transform(train['name'].values)\n",
    "\n",
    "        vectorizer2 = Tfidf(max_features=100000, token_pattern='\\w+', ngram_range=(1, 2), dtype=np.float32)\n",
    "        train_textvec  = vectorizer2.fit_transform(train['text'].values)\n",
    "\n",
    "        vectorizer3 = OneHotEncoder(dtype=np.float32)\n",
    "        train_shipvec = vectorizer3.fit_transform(train['shipping'].values.reshape(-1, 1))\n",
    "\n",
    "        vectorizer4 = OneHotEncoder(dtype=np.float32)\n",
    "        train_conditionvec = vectorizer4.fit_transform(train['item_condition_id'].values.reshape(-1, 1))\n",
    "\n",
    "        X_train = hstack((train_namevec, train_textvec, train_shipvec, train_conditionvec)).tocsr()\n",
    "\n",
    "    with timer('process valid'):\n",
    "        valid = preprocess(valid)\n",
    "\n",
    "        valid_namevec  = vectorizer1.transform(valid['name'].values)\n",
    "\n",
    "        valid_textvec  = vectorizer2.transform(valid['text'].values)\n",
    "\n",
    "        valid_shipvec = vectorizer3.transform(valid['shipping'].values.reshape(-1, 1))\n",
    "\n",
    "        valid_conditionvec = vectorizer4.transform(valid['item_condition_id'].values.reshape(-1, 1))\n",
    "\n",
    "        X_valid = hstack((valid_namevec, valid_textvec, valid_shipvec, valid_conditionvec)).tocsr()\n",
    "    \n",
    "        # Binarizing input\n",
    "        Xb_train, Xb_valid = [x.astype(np.bool).astype(np.float32) for x in [X_train, X_valid]]\n",
    "        \n",
    "    \n",
    "    return X_train, X_valid, Xb_train, Xb_valid "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "Ma_rAHPzpsVX"
   },
   "source": [
    "### Processing test data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "W9UropkmptwL"
   },
   "outputs": [],
   "source": [
    "def load_process_test():\n",
    "\n",
    "    test = pd.read_table('test.tsv')\n",
    "\n",
    "    global predictions\n",
    "    predictions = pd.DataFrame(test['test_id'])\n",
    "\n",
    "    with timer('process test'):\n",
    "        test = preprocess(test)\n",
    "\n",
    "        test_namevec  = vectorizer1.transform(test['name'].values)\n",
    "\n",
    "        test_textvec  = vectorizer2.transform(test['text'].values)\n",
    "\n",
    "        test_shipvec = vectorizer3.transform(test['shipping'].values.reshape(-1, 1))\n",
    "\n",
    "        test_conditionvec = vectorizer4.transform(test['item_condition_id'].values.reshape(-1, 1))\n",
    "\n",
    "        X_test = hstack((test_namevec, test_textvec, test_shipvec, test_conditionvec)).tocsr()\n",
    "    \n",
    "        # Binarizing input\n",
    "        Xb_test = X_test.astype(np.bool).astype(np.float32)\n",
    "    \n",
    "    return X_test, Xb_test\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "WTd__2dhgMKC"
   },
   "source": [
    "### Model 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "O4hehORYbtUE"
   },
   "outputs": [],
   "source": [
    "def run_model1(X_train, y_train, X_valid, y_valid):\n",
    "    '''- returns an MLP model trained on tfidf vectorized sparse input.\n",
    "    - Does not perform best on binarized input.\n",
    "    - Uses Adam optimizer with constant learning rate. \n",
    "    - trains 2 epochs, Batch size is doubled at every epoch to speed up the optimization'''\n",
    "\n",
    "    model_in = ks.Input(shape=(X_train.shape[1],), dtype='float32', sparse=True)\n",
    "    out = ks.layers.Dense(256, activation='relu')(model_in)\n",
    "    # out = ks.layers.Dropout(0.1)(out)     ## performance is better without dropouts\n",
    "    out = ks.layers.Dense(64, activation='relu')(out)\n",
    "    # out = ks.layers.Dropout(0.1)(out)\n",
    "    out = ks.layers.Dense(64, activation='relu')(out)\n",
    "    # out = ks.layers.Dropout(0.2)(out)\n",
    "    out = ks.layers.Dense(32, activation='relu')(out)\n",
    "    out = ks.layers.Dense(1)(out)\n",
    "    model = ks.Model(model_in, out)\n",
    "    \n",
    "    model.compile(loss='mean_squared_error', optimizer=ks.optimizers.Adam(lr=3e-3))\n",
    "    for i in range(2):\n",
    "        with timer(f'epoch {i + 1}'):\n",
    "            model.fit(x=X_train, y=y_train, batch_size=2**(9 + i), epochs=1, verbose=1, validation_data=(X_valid, y_valid))\n",
    "    \n",
    "    return model\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "gJWQzK7EhA9r"
   },
   "source": [
    "### Model 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "TzheMNU-hA9u"
   },
   "outputs": [],
   "source": [
    "def run_model2(Xb_train, y_train, Xb_valid, y_valid):\n",
    "    '''- returns an MLP model trained on binarized sparse input.\n",
    "    - Does not perform best on non-binarized(regular) input.\n",
    "    - Uses Adam optimizer with constant learning rate. \n",
    "    - trains 3 epochs, Batch size is doubled at every epoch to speed up the optimization'''\n",
    "    \n",
    "    model_in = ks.Input(shape=(Xb_train.shape[1],), dtype='float32', sparse=True)\n",
    "    out = ks.layers.Dense(256, activation='relu')(model_in)\n",
    "    # out = ks.layers.Dropout(0.1)(out)     ## performance is better without dropouts\n",
    "    out = ks.layers.Dense(64, activation='relu')(out)\n",
    "    # out = ks.layers.Dropout(0.1)(out)\n",
    "    out = ks.layers.Dense(64, activation='relu')(out)\n",
    "    # out = ks.layers.Dropout(0.2)(out)\n",
    "    out = ks.layers.Dense(32, activation='relu')(out)\n",
    "    out = ks.layers.Dense(1)(out)\n",
    "    model = ks.Model(model_in, out)\n",
    "    \n",
    "    model.compile(loss='mean_squared_error', optimizer=ks.optimizers.Adam(lr=3e-3))\n",
    "    for i in range(3):\n",
    "        with timer(f'epoch {i + 1}'):\n",
    "            model.fit(x=Xb_train, y=y_train, batch_size=2**(9 + i), epochs=1, verbose=1, validation_data=(Xb_valid, y_valid))\n",
    "    \n",
    "    return model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "Mz0hXLHk_Jw9"
   },
   "source": [
    "### Main function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 632767,
     "status": "ok",
     "timestamp": 1584606117896,
     "user": {
      "displayName": "Arun Singh",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GiLB1oZWi0vJw-6NUEFnpDZhxuDlv90nmX88O0y-A=s64",
      "userId": "01425536422013553450"
     },
     "user_tz": -330
    },
    "id": "bb_0xeOa9_p3",
    "outputId": "7f34950d-293c-4282-c40c-7cd7133bba6f"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "Loading and processing train data.....\n",
      "[process train] done in 152 s\n",
      "[process valid] done in 7 s\n",
      "(1407577, 200007) (1407577, 1) (74084, 200007) (74084, 1)\n",
      "\n",
      "\n",
      "Running model on regular (non-binary) input.....\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:66: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n",
      "\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:539: The name tf.sparse_placeholder is deprecated. Please use tf.compat.v1.sparse_placeholder instead.\n",
      "\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4432: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.\n",
      "\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:1133: The name tf.sparse_tensor_dense_matmul is deprecated. Please use tf.sparse.sparse_dense_matmul instead.\n",
      "\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:793: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n",
      "\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:541: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.\n",
      "\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:1033: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.\n",
      "\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:1020: The name tf.assign is deprecated. Please use tf.compat.v1.assign instead.\n",
      "\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3005: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.\n",
      "\n",
      "Train on 1407577 samples, validate on 74084 samples\n",
      "Epoch 1/1\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:190: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.\n",
      "\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:197: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.\n",
      "\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:207: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.\n",
      "\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:216: The name tf.is_variable_initialized is deprecated. Please use tf.compat.v1.is_variable_initialized instead.\n",
      "\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:223: The name tf.variables_initializer is deprecated. Please use tf.compat.v1.variables_initializer instead.\n",
      "\n",
      "1407577/1407577 [==============================] - 64s 45us/step - loss: 0.3439 - val_loss: 0.2992\n",
      "[epoch 1] done in 64 s\n",
      "Train on 1407577 samples, validate on 74084 samples\n",
      "Epoch 1/1\n",
      "1407577/1407577 [==============================] - 32s 23us/step - loss: 0.2029 - val_loss: 0.2910\n",
      "[epoch 2] done in 32 s\n",
      "Model: \"model_1\"\n",
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "input_1 (InputLayer)         (None, 200007)            0         \n",
      "_________________________________________________________________\n",
      "dense_1 (Dense)              (None, 256)               51202048  \n",
      "_________________________________________________________________\n",
      "dense_2 (Dense)              (None, 64)                16448     \n",
      "_________________________________________________________________\n",
      "dense_3 (Dense)              (None, 64)                4160      \n",
      "_________________________________________________________________\n",
      "dense_4 (Dense)              (None, 32)                2080      \n",
      "_________________________________________________________________\n",
      "dense_5 (Dense)              (None, 1)                 33        \n",
      "=================================================================\n",
      "Total params: 51,224,769\n",
      "Trainable params: 51,224,769\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n",
      "1st run val RMSLE: 0.4023\n",
      "Train on 1407577 samples, validate on 74084 samples\n",
      "Epoch 1/1\n",
      "1407577/1407577 [==============================] - 63s 45us/step - loss: 0.3438 - val_loss: 0.3002\n",
      "[epoch 1] done in 63 s\n",
      "Train on 1407577 samples, validate on 74084 samples\n",
      "Epoch 1/1\n",
      "1407577/1407577 [==============================] - 33s 23us/step - loss: 0.2021 - val_loss: 0.2871\n",
      "[epoch 2] done in 33 s\n",
      "Model: \"model_2\"\n",
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "input_2 (InputLayer)         (None, 200007)            0         \n",
      "_________________________________________________________________\n",
      "dense_6 (Dense)              (None, 256)               51202048  \n",
      "_________________________________________________________________\n",
      "dense_7 (Dense)              (None, 64)                16448     \n",
      "_________________________________________________________________\n",
      "dense_8 (Dense)              (None, 64)                4160      \n",
      "_________________________________________________________________\n",
      "dense_9 (Dense)              (None, 32)                2080      \n",
      "_________________________________________________________________\n",
      "dense_10 (Dense)             (None, 1)                 33        \n",
      "=================================================================\n",
      "Total params: 51,224,769\n",
      "Trainable params: 51,224,769\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n",
      "2nd run val RMSLE: 0.3996\n",
      "\n",
      "\n",
      "Running model on binarized input.....\n",
      "Train on 1407577 samples, validate on 74084 samples\n",
      "Epoch 1/1\n",
      "1407577/1407577 [==============================] - 63s 45us/step - loss: 0.3500 - val_loss: 0.3060\n",
      "[epoch 1] done in 63 s\n",
      "Train on 1407577 samples, validate on 74084 samples\n",
      "Epoch 1/1\n",
      "1407577/1407577 [==============================] - 32s 23us/step - loss: 0.2077 - val_loss: 0.2934\n",
      "[epoch 2] done in 32 s\n",
      "Train on 1407577 samples, validate on 74084 samples\n",
      "Epoch 1/1\n",
      "1407577/1407577 [==============================] - 18s 13us/step - loss: 0.1187 - val_loss: 0.2949\n",
      "[epoch 3] done in 18 s\n",
      "Model: \"model_3\"\n",
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "input_3 (InputLayer)         (None, 200007)            0         \n",
      "_________________________________________________________________\n",
      "dense_11 (Dense)             (None, 256)               51202048  \n",
      "_________________________________________________________________\n",
      "dense_12 (Dense)             (None, 64)                16448     \n",
      "_________________________________________________________________\n",
      "dense_13 (Dense)             (None, 64)                4160      \n",
      "_________________________________________________________________\n",
      "dense_14 (Dense)             (None, 32)                2080      \n",
      "_________________________________________________________________\n",
      "dense_15 (Dense)             (None, 1)                 33        \n",
      "=================================================================\n",
      "Total params: 51,224,769\n",
      "Trainable params: 51,224,769\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n",
      "3rd run val RMSLE: 0.4050\n",
      "Train on 1407577 samples, validate on 74084 samples\n",
      "Epoch 1/1\n",
      "1407577/1407577 [==============================] - 65s 46us/step - loss: 0.3501 - val_loss: 0.3043\n",
      "[epoch 1] done in 66 s\n",
      "Train on 1407577 samples, validate on 74084 samples\n",
      "Epoch 1/1\n",
      "1407577/1407577 [==============================] - 37s 26us/step - loss: 0.2089 - val_loss: 0.2937\n",
      "[epoch 2] done in 37 s\n",
      "Train on 1407577 samples, validate on 74084 samples\n",
      "Epoch 1/1\n",
      "1407577/1407577 [==============================] - 19s 14us/step - loss: 0.1202 - val_loss: 0.2958\n",
      "[epoch 3] done in 19 s\n",
      "Model: \"model_4\"\n",
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "input_4 (InputLayer)         (None, 200007)            0         \n",
      "_________________________________________________________________\n",
      "dense_16 (Dense)             (None, 256)               51202048  \n",
      "_________________________________________________________________\n",
      "dense_17 (Dense)             (None, 64)                16448     \n",
      "_________________________________________________________________\n",
      "dense_18 (Dense)             (None, 64)                4160      \n",
      "_________________________________________________________________\n",
      "dense_19 (Dense)             (None, 32)                2080      \n",
      "_________________________________________________________________\n",
      "dense_20 (Dense)             (None, 1)                 33        \n",
      "=================================================================\n",
      "Total params: 51,224,769\n",
      "Trainable params: 51,224,769\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n",
      "4th run val RMSLE: 0.4056\n",
      "\n",
      "\n",
      "Ensemble (weighted average of predictions from 4 models/runs).....\n",
      "Final valid RMSLE: 0.3848\n",
      "Code finished execution in 0:10:29.996681\n"
     ]
    }
   ],
   "source": [
    "DEVELOP = True # Set to True for only trainng and validation\n",
    "\n",
    "def main():\n",
    "\n",
    "    start_time = datetime.now()\n",
    "\n",
    "    print('\\n\\nLoading and processing train data.....')\n",
    "    train, valid, y_train, y_valid = load_train()\n",
    "\n",
    "    X_train, X_valid, Xb_train, Xb_valid = process_train(train, valid)\n",
    "    print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)\n",
    "\n",
    "    del train, valid\n",
    "    gc.collect()\n",
    "\n",
    "    ## Running model1 on regualr data X_train, X_valid\n",
    "    print('\\n\\nRunning model on regular (non-binary) input.....')\n",
    "    model1 = run_model1(X_train, y_train, X_valid, y_valid)\n",
    "    model1.save('model1.h5')\n",
    "    model1 = load_model('model1.h5')\n",
    "    model1.summary()\n",
    "    pred1 = model1.predict(X_valid)[:, 0]\n",
    "\n",
    "    y_pred = np.expm1(y_scaler.inverse_transform(pred1.reshape(-1, 1))[:, 0])\n",
    "    print('1st run val RMSLE: {:.4f}'.format(np.sqrt(msle(valid_price, y_pred))))\n",
    "    \n",
    "    ## Running again\n",
    "    model2 = run_model1(X_train, y_train, X_valid, y_valid)\n",
    "    model2.save('model2.h5')\n",
    "    model2 = load_model('model2.h5')\n",
    "    model2.summary()\n",
    "    pred2 = model2.predict(X_valid)[:, 0]\n",
    "\n",
    "    y_pred = np.expm1(y_scaler.inverse_transform(pred2.reshape(-1, 1))[:, 0])\n",
    "    print('2nd run val RMSLE: {:.4f}'.format(np.sqrt(msle(valid_price, y_pred))))\n",
    "\n",
    "    ## Running model2 on binarized data Xb_train, Xb_valid\n",
    "    print('\\n\\nRunning model on binarized input.....')\n",
    "    model3 = run_model2(Xb_train, y_train, Xb_valid, y_valid)\n",
    "    model3.save('model3.h5')\n",
    "    model3 = load_model('model3.h5')\n",
    "    model3.summary()\n",
    "    pred3 = model3.predict(Xb_valid)[:, 0]\n",
    "\n",
    "    y_pred = np.expm1(y_scaler.inverse_transform(pred3.reshape(-1, 1))[:, 0])\n",
    "    print('3rd run val RMSLE: {:.4f}'.format(np.sqrt(msle(valid_price, y_pred))))\n",
    "    \n",
    "    ## Running again\n",
    "    model4 = run_model2(Xb_train, y_train, Xb_valid, y_valid)\n",
    "    model4.save('model4.h5')\n",
    "    model4 = load_model('model4.h5')\n",
    "    model4.summary()\n",
    "    pred4 = model4.predict(Xb_valid)[:, 0]\n",
    "\n",
    "    y_pred = np.expm1(y_scaler.inverse_transform(pred4.reshape(-1, 1))[:, 0])\n",
    "    print('4th run val RMSLE: {:.4f}'.format(np.sqrt(msle(valid_price, y_pred))))\n",
    "\n",
    "\n",
    "    ## Final Prediction = weighted average of predictions of 4 models/runs\n",
    "    print('\\n\\nEnsemble (weighted average of predictions from 4 models/runs).....')\n",
    "    y_pred = np.average([pred1, pred2, pred3, pred4], weights=[0.33, 0.33, 0.17, 0.17], axis=0)\n",
    "    y_pred = np.expm1(y_scaler.inverse_transform(y_pred.reshape(-1, 1))[:, 0])\n",
    "    print('Final valid RMSLE: {:.4f}'.format(np.sqrt(msle(valid_price, y_pred))))\n",
    "\n",
    "    if DEVELOP==False:\n",
    "        ## This block loads and predicts on test data if DEVELOP is not set\n",
    "\n",
    "        # print('\\n\\nLoading and processing test data.....')\n",
    "        X_test, Xb_test = load_process_test()\n",
    "        print(X_test.shape, Xb_test.shape)\n",
    "\n",
    "        with timer('predict test'):\n",
    "            test_pred1 = model1.predict(X_test)[:, 0]\n",
    "            test_pred2 = model2.predict(X_test)[:, 0]\n",
    "            test_pred3 = model3.predict(Xb_test)[:, 0]\n",
    "            test_pred4 = model4.predict(Xb_test)[:, 0]\n",
    "\n",
    "\n",
    "        test_pred = np.average([test_pred1, test_pred2, test_pred3, test_pred4], weights=[0.33, 0.33, 0.17, 0.17], axis=0)\n",
    "        test_pred = np.expm1(y_scaler.inverse_transform(test_pred.reshape(-1, 1))[:, 0])\n",
    "\n",
    "        print('\\n\\nCreating submisssion file.....')\n",
    "        predictions['price'] = test_pred\n",
    "\n",
    "        predictions.to_csv('predictions.csv', index=False)\n",
    "\n",
    "\n",
    "    print(f'Code finished execution in {datetime.now() - start_time}')\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    main()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "n_P0mJGNJby8"
   },
   "source": [
    "- **Compared to the original code, I have made some changes in the MLP architecture as well as parameters such as learning rate and batch size to get better results.**\n",
    "\n",
    "- **I also tried using dropouts(0.1, 0.2, 0.3, .. 0.5) but the models performed better without dropouts (I got a validation RMSLE of 0.3872 with dropouts). So I have removed dropouts in the final code.**\n",
    "\n",
    "- **I also experimented with diffrent activation units ('tanh', 'sigmoid', 'linear', 'relu'). 'relu' performs significantly better that rest all.**\n",
    "\n",
    "- **I have also changed the number of epochs from 3 to 2 for model1 (non-binary data), as the model starts overfitting from 3rd epoch.**\n",
    "\n",
    "- **Instead of taking simple mean, I have taken weighted average of predictions from 4 different models/runs.**\n",
    "\n",
    "- **For simplicity of the code, and also because I have used Google Colab(training is faster with GPUs than with multi-core CPUs), I have trained the models one after another unlike pool processing in the original kernel. The code finishes running in decent amount of time on Colab.**\n",
    "\n",
    "- **The validation RMSLE I got was 0.3848 as compared to 0.3875 in the source kernel.**"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "authorship_tag": "ABX9TyNR52kHrFkP/u3D2s9JFdag",
   "collapsed_sections": [],
   "machine_shape": "hm",
   "name": "Mercari_MLP_0.3848.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
